#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on May 27, 2019
Modified on Dec 21, 2020

@author: gloria
"""

# PACKAGES #############################################################

import string
import re
import pandas as pd
import os
import numpy as np
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import stem_text
from nltk import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
# Translation table that deletes every ASCII punctuation character when passed
# to str.translate().
translator = str.maketrans(dict.fromkeys(string.punctuation))
import random

# DICTIONARY ########################################################

# Keyword lists, one per populism dimension. The entries are matched against
# the tf-idf vocabulary built from stemmed sentences further down, hence the
# truncated spellings.
people = [
    'peopl', 'tradit', 'tradition',
]
elite = [
    'cast', 'class', 'elit', 'elitist', 'establish',
    'polit', 'politic', 'politician', 'corrupt', 'regim',
    'regimen', 'rule', 'propaganda', 'directori', 'promin',
]
betray = [
    'arrog', 'arrogantli', 'betrai', 'treason', 'promis', 'shame',
    'undemocrat', 'deceit', 'absurd', 'absurdli', 'admit', 'admitt',
]
direct = [
    'direct', 'directli', 'referendum',
]

# SELECT COMPUTER ###################################################

# Account name spliced into the '/Users/<name>/Dropbox/...' paths used below;
# switch machines by commenting/uncommenting.
computer = 'glgennaro'  # work
# computer = 'gloria' #personal


# PRESIDENTIAL 2016 #################################################

# Upload dataset
# Both candidates' speeches are loaded; the one actually analysed is chosen
# further down.
os.chdir('/Users/' + computer + '/Dropbox/Progetti/Rhetoric/topics')
trump = pd.read_csv('all_punctuated_trump.csv', sep=',', encoding='utf-8')
# Columns are dropped by keyword: passing the axis positionally
# (drop('Unnamed: 0', 1)) is no longer accepted by current pandas releases.
trump = trump.drop(columns='Unnamed: 0')
trump['candidate'] = 'trump'

clinton = pd.read_csv('all_punctuated_clinton.csv', sep=',', encoding='utf-8')
clinton = clinton.drop(columns=['Unnamed: 0', 'presidency'])
clinton['candidate'] = 'clinton'

# Select which candidate to work on now (switch by commenting/uncommenting)
text = list(trump['text'])
# text = list(clinton['text'])

# Clean text
# Strings that are replaced by a single space; add here every string we want
# to remove. '--' can never match because '-' is replaced before it.
daeliminare = ['â€”','â€“','Â','“','-','--',]
# Everything from the first occurrence of one of these markers onwards is cut,
# to remove the marginal comments (the editor's closing notes).
editor_markers = ('APP Note:', 'NOTE:', 'Citation:')

cleaned = []
for doc in text:
    doc = doc.strip("/")
    # Collapse runs of spaces, then add a space after every full stop that is
    # not already followed by one (the raw text has many such mistakes).
    doc = re.sub(r'\.(?! )', '. ', re.sub(r' +', ' ', doc))
    doc = doc.replace(']', '] ').replace(':', ': ')
    # Remove bracketed annotations. Written as a raw string: the same pattern
    # in a plain literal relies on the invalid escapes '\[' and '\]', which
    # recent Python versions warn about.
    doc = re.sub(r"[\[].*?[\]]", "", doc)
    for junk in daeliminare:
        doc = doc.replace(junk, ' ')
    for marker in editor_markers:
        doc = doc.split(marker, 1)[0]
    # NOTE(review): '[^)]' excludes ')' rather than ']' -- possibly a typo for
    # '[^\]]'. Left unchanged so the cleaned text stays the same.
    doc = re.sub(r'\[[^)]*\]', '', doc)
    cleaned.append(doc)
text = cleaned


# Extract sentences
# Each kept row pairs the original sentence (column 0) with its cleaned,
# stemmed version (column 1).
sent = []
for document in text:
    for sentence in sent_tokenize(document):
        # Punctuation is stripped first, then stop words, then the case is
        # lowered.
        # NOTE(review): stop words are removed before lowercasing; if
        # remove_stopwords is case-sensitive, capitalised ones survive this
        # step -- confirm.
        bare = remove_stopwords(sentence.translate(translator)).lower()
        # Keep only sentences with at least 10 remaining words.
        if len(bare.split()) < 10:
            continue
        sent.append((sentence, stem_text(bare)))
df = pd.DataFrame(sent)

# Define Tf-Idf
# min_df=0.01: a term must appear in at least 1% of the sentences to enter
# the vocabulary.
tfidf = TfidfVectorizer(min_df=0.01,
                        stop_words='english',
                        use_idf=True)

# One row per sentence, one column per vocabulary term.
X_tfidf = tfidf.fit_transform(df[1])
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back to the old name only on releases that predate
# it; list() keeps feature_names a plain list either way.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()

# Calculate the People, Elite, Betray and Direct scores
# For every sentence, each score is the sum of the tf-idf weights of the
# terms that belong to the corresponding keyword list (0 when none occurs).
# The (term, weight) pairs of a sentence are looked up once and reused for all
# four lists instead of re-indexing the sparse matrix in four separate loops;
# the weights are added in the same order, so the scores are unchanged.
lexicons = {'people': people, 'elite': elite, 'betray': betray, 'direct': direct}
lexicon_scores = {label: [] for label in lexicons}
for row in range(len(df[1])):
    # Column positions of the terms present in this sentence.
    columns = X_tfidf[row, :].nonzero()[1]
    weighted_terms = [(feature_names[col], X_tfidf[row, col]) for col in columns]
    for label, stems in lexicons.items():
        lexicon_scores[label].append(
            sum(weight for term, weight in weighted_terms if term in stems))

pop_people = lexicon_scores['people']
pop_elite = lexicon_scores['elite']
pop_betray = lexicon_scores['betray']
pop_direct = lexicon_scores['direct']


# Put all together to get the final score
component_columns = ['direct_score', 'betray_score', 'elite_score', 'people_score']
component_values = (pop_direct, pop_betray, pop_elite, pop_people)
for column, values in zip(component_columns, component_values):
    df[column] = values

# 'pop': sum of the four components, forced to 0 unless all four are non-zero.
components = df[component_columns]
df['pop'] = np.where(components.all(axis=1), components.sum(axis=1), 0)

# Two aggregated dimensions: people (direct + people) and elite (betray + elite).
df['people'] = df[['direct_score', 'people_score']].sum(axis=1)
df['elite'] = df[['betray_score', 'elite_score']].sum(axis=1)
# 'pop2': sum of the two dimensions, forced to 0 unless both are non-zero.
dimensions = df[['people', 'elite']]
df['pop2'] = np.where(dimensions.all(axis=1), dimensions.sum(axis=1), 0)

# Extract the top and bottom sentences by 'pop2'
df = df.sort_values('pop2')
# .copy() makes both selections independent frames, so assigning the rounded
# column below does not raise pandas' SettingWithCopyWarning.
# NOTE(review): 'top10' holds the last 11 rows, not 10 -- confirm intended.
top10 = df.tail(11)[[0, 'pop2']].copy()
low10 = df.head(10)[[0, 'pop2']].copy()

# Round the scores and order them
top10['pop2'] = [round(a, 3) for a in top10['pop2']]
top10 = top10.sort_values(by=['pop2'], ascending=True)
low10['pop2'] = [round(a, 3) for a in low10['pop2']]
low10 = low10.sort_values(by=['pop2'], ascending=True)

# Save
# NOTE(review): the file names say 'trump' whichever candidate was selected
# above; rename them when running the Clinton texts.
os.chdir('/Users/' + computer + '/Dropbox/Progetti/Rhetoric/output_text_analysis')
top10.to_csv('top_trump_sentences2.csv')
low10.to_csv('bottom_trump_sentences2.csv')


# CONGRESS 2018 ###############################################

# Upload texts
os.chdir('/Users/' + computer + '/Dropbox/Progetti/Rhetoric/dta/legislative')
# Rows with a missing value in any column are discarded.
df = pd.read_csv('legislative2018_mod.CSV', sep=',', encoding='windows-1252').dropna()

# Extract and clean sentences
# Each kept row pairs the original sentence (column 0) with its cleaned,
# stemmed version (column 1).
text = list(df['text'])
sent = []
for document in text:
    for sentence in sent_tokenize(document):
        # Strip punctuation, remove stop words, then lowercase.
        bare = remove_stopwords(sentence.translate(translator)).lower()
        # Keep only sentences with at least 10 remaining words.
        if len(bare.split()) < 10:
            continue
        sent.append((sentence, stem_text(bare)))
df = pd.DataFrame(sent)

# Define the Tf-Idf
# min_df=0.01: a term must appear in at least 1% of the sentences to enter
# the vocabulary.
tfidf = TfidfVectorizer(min_df=0.01,
                        stop_words='english',
                        use_idf=True)

# One row per sentence, one column per vocabulary term.
X_tfidf = tfidf.fit_transform(df[1])
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back to the old name only on releases that predate
# it; list() keeps feature_names a plain list either way.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()

# Find the People, Elite, Betray and Direct scores
# For every sentence, each score is the sum of the tf-idf weights of the
# terms that belong to the corresponding keyword list (0 when none occurs).
# The (term, weight) pairs of a sentence are looked up once and reused for all
# four lists instead of re-indexing the sparse matrix in four separate loops;
# the weights are added in the same order, so the scores are unchanged.
lexicons = {'people': people, 'elite': elite, 'betray': betray, 'direct': direct}
lexicon_scores = {label: [] for label in lexicons}
for row in range(len(df[1])):
    # Column positions of the terms present in this sentence.
    columns = X_tfidf[row, :].nonzero()[1]
    weighted_terms = [(feature_names[col], X_tfidf[row, col]) for col in columns]
    for label, stems in lexicons.items():
        lexicon_scores[label].append(
            sum(weight for term, weight in weighted_terms if term in stems))

pop_people = lexicon_scores['people']
pop_elite = lexicon_scores['elite']
pop_betray = lexicon_scores['betray']
pop_direct = lexicon_scores['direct']


# Put all together to find the final score
component_columns = ['direct_score', 'betray_score', 'elite_score', 'people_score']
component_values = (pop_direct, pop_betray, pop_elite, pop_people)
for column, values in zip(component_columns, component_values):
    df[column] = values

# 'pop': sum of the four components, forced to 0 unless all four are non-zero.
components = df[component_columns]
df['pop'] = np.where(components.all(axis=1), components.sum(axis=1), 0)
# Two aggregated dimensions: people (direct + people) and elite (betray + elite).
df['people'] = df[['direct_score', 'people_score']].sum(axis=1)
df['elite'] = df[['betray_score', 'elite_score']].sum(axis=1)
# 'pop2': sum of the two dimensions, forced to 0 unless both are non-zero.
dimensions = df[['people', 'elite']]
df['pop2'] = np.where(dimensions.all(axis=1), dimensions.sum(axis=1), 0)
# Report how many sentences end up with a zero 'pop2' score.
print(int((df['pop2'] == 0).sum()))

# Sort and extract top and bottom populist sentences
df = df.sort_values('pop2')
# .copy() makes the selections independent frames, so assigning the rounded
# column below does not raise pandas' SettingWithCopyWarning.
# NOTE(review): 'top10' holds the last 11 rows, not 10 -- confirm intended.
top10 = df.tail(11)[[0, 'pop2']].copy()
# The "bottom" set is a random draw among the sentences scoring exactly 0.
# The sample size is capped so that fewer than 10 such sentences no longer
# raises, and the seed is fixed so the exported files are reproducible
# (change random_state to draw a different sample).
zero_scored = df[df['pop2'] == 0]
low10 = zero_scored.sample(n=min(10, len(zero_scored)), random_state=0)
low10 = low10[[0, 'pop2']].copy()

# Round populism scores
top10['pop2'] = [round(a, 3) for a in top10['pop2']]
top10 = top10.sort_values(by=['pop2'], ascending=True)
low10['pop2'] = [round(a, 3) for a in low10['pop2']]
low10 = low10.sort_values(by=['pop2'], ascending=True)

# Save
os.chdir('/Users/' + computer + '/Dropbox/Progetti/Rhetoric/output_text_analysis')
top10.to_csv('top_cong_sentences2018.csv')
low10.to_csv('bottom_cong_sentences2018.csv')

# Raised column width: presumably so long sentences are not truncated in the
# LaTeX tables -- confirm against the pandas version in use.
with pd.option_context("max_colwidth", 1000):
    low10.to_latex('low_cong_sentences2018.tex', float_format='%.3f', index=False)
    top10.to_latex('top_cong_sentences2018.tex', float_format='%.3f', index=False)



# CONGRESS 2020 ###############################################

# Upload texts
os.chdir('/Users/' + computer + '/Dropbox/Progetti/Rhetoric/Raw data/Congress 2020')

df = pd.read_excel('dataset_house.xlsx', sheet_name='txt')
df = df.loc[:, ~df.columns.str.startswith('Title_')]
# Join all 'topic_*' columns of a row into one text. Blank cells are detected
# on the raw data and emptied, instead of comparing the stringified cells with
# the text 'nan', so a cell whose content really is "nan" is kept.
topic_columns = [a for a in list(df) if a.startswith('topic_')]
raw_topics = df[topic_columns]
selection = raw_topics.astype(str).mask(raw_topics.isna(), '')
selection = selection.agg(' '.join, axis=1)

# Extract sentences
# NOTE(review): newlines are removed without inserting a space, so words on
# either side of a line break are glued together -- confirm this is intended.
text = [a.replace('\n', '') for a in selection]
del selection

# Clean sentences
# Each kept row pairs the original sentence (column 0) with its cleaned,
# stemmed version (column 1).
sent = []
for document in text:
    for sentence in sent_tokenize(document):
        # Strip punctuation, remove stop words, then lowercase.
        bare = remove_stopwords(sentence.translate(translator)).lower()
        # Keep only sentences with at least 10 remaining words.
        if len(bare.split()) < 10:
            continue
        sent.append((sentence, stem_text(bare)))
df = pd.DataFrame(sent)

# Define the Tf-Idf
# min_df=0.01: a term must appear in at least 1% of the sentences to enter
# the vocabulary.
tfidf = TfidfVectorizer(min_df=0.01,
                        stop_words='english',
                        use_idf=True)

# One row per sentence, one column per vocabulary term.
X_tfidf = tfidf.fit_transform(df[1])
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back to the old name only on releases that predate
# it; list() keeps feature_names a plain list either way.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()

# Find the People, Elite, Betray and Direct scores
# For every sentence, each score is the sum of the tf-idf weights of the
# terms that belong to the corresponding keyword list (0 when none occurs).
# The (term, weight) pairs of a sentence are looked up once and reused for all
# four lists instead of re-indexing the sparse matrix in four separate loops;
# the weights are added in the same order, so the scores are unchanged.
lexicons = {'people': people, 'elite': elite, 'betray': betray, 'direct': direct}
lexicon_scores = {label: [] for label in lexicons}
for row in range(len(df[1])):
    # Column positions of the terms present in this sentence.
    columns = X_tfidf[row, :].nonzero()[1]
    weighted_terms = [(feature_names[col], X_tfidf[row, col]) for col in columns]
    for label, stems in lexicons.items():
        lexicon_scores[label].append(
            sum(weight for term, weight in weighted_terms if term in stems))

pop_people = lexicon_scores['people']
pop_elite = lexicon_scores['elite']
pop_betray = lexicon_scores['betray']
pop_direct = lexicon_scores['direct']


# Put all together to find the final score
component_columns = ['direct_score', 'betray_score', 'elite_score', 'people_score']
component_values = (pop_direct, pop_betray, pop_elite, pop_people)
for column, values in zip(component_columns, component_values):
    df[column] = values

# 'pop': sum of the four components, forced to 0 unless all four are non-zero.
components = df[component_columns]
df['pop'] = np.where(components.all(axis=1), components.sum(axis=1), 0)
# Two aggregated dimensions: people (direct + people) and elite (betray + elite).
df['people'] = df[['direct_score', 'people_score']].sum(axis=1)
df['elite'] = df[['betray_score', 'elite_score']].sum(axis=1)
# 'pop2': sum of the two dimensions, forced to 0 unless both are non-zero.
dimensions = df[['people', 'elite']]
df['pop2'] = np.where(dimensions.all(axis=1), dimensions.sum(axis=1), 0)

# Sort and extract top and bottom populist sentences
df = df.sort_values('pop2')
# .copy() makes the selections independent frames, so assigning the rounded
# column below does not raise pandas' SettingWithCopyWarning.
# NOTE(review): 'top10' holds the last 11 rows, not 10 -- confirm intended.
top10 = df.tail(11)[[0, 'pop2']].copy()
# The "bottom" set is a random draw among the sentences scoring exactly 0.
# The sample size is capped so that fewer than 10 such sentences no longer
# raises, and the seed is fixed so the exported files are reproducible
# (change random_state to draw a different sample).
zero_scored = df[df['pop2'] == 0]
low10 = zero_scored.sample(n=min(10, len(zero_scored)), random_state=0)
low10 = low10[[0, 'pop2']].copy()

# Round populism scores
top10['pop2'] = [round(a, 3) for a in top10['pop2']]
top10 = top10.sort_values(by=['pop2'], ascending=True)
low10['pop2'] = [round(a, 3) for a in low10['pop2']]
low10 = low10.sort_values(by=['pop2'], ascending=True)

# Save
os.chdir('/Users/' + computer + '/Dropbox/Progetti/Rhetoric/output_text_analysis')
top10.to_csv('top_cong_sentences2020.csv')
low10.to_csv('bottom_cong_sentences2020.csv')

# Raised column width: presumably so long sentences are not truncated in the
# LaTeX tables -- confirm against the pandas version in use.
with pd.option_context("max_colwidth", 1000):
    low10.to_latex('low_cong_sentences2020.tex', float_format='%.3f', index=False)
    top10.to_latex('top_cong_sentences2020.tex', float_format='%.3f', index=False)




# # Write them all to a txt file for evaluation
# df1 = pd.read_csv('top_cong_sentences.CSV', sep=',')
# df1 = df1['0']
# df2 = pd.read_csv('bottom_cong_sentences.CSV', sep=',')
# df2 = df2['0']
# df3 = pd.read_csv('top_clinton_sentences.CSV', sep=',')
# df3 = df3['0']
# df4 = pd.read_csv('bottom_clinton_sentences.CSV', sep=',')
# df4 = df4['0']
# df5 = pd.read_csv('top_trump_sentences.CSV', sep=',')
# df5 = df5['0']
# df6 = pd.read_csv('bottom_trump_sentences.CSV', sep=',')
# df6 = df6['0']

# d = list(df1) + list(df2) + list(df3) + list(df4) + list(df5) + list(df6)
# random.shuffle(d)

# with open('all.txt', 'w') as f:
#     for item in d:
#         f.write("%s\n\n" % item)


